Implement a scalable PageRank algorithm on the Google web graph dataset Google.txt. Use a random teleporting probability of 0.2. Each line in the file has two numbers separated by a tab character; this represents a directed edge in the graph. For example, "0	824020" is an edge from node 0 to node 824020. Calculate the PageRank value of node '99'.
In [1]:
from scipy.sparse import coo_matrix
import numpy as np
In [2]:
# Mapping from raw node label (string, as read from the file) -> dense
# integer index in [0, n).
nodesID = {}
# Number of distinct nodes assigned an index so far.
n = 0
# Number of raw lines read from the input file (bookkeeping only).
line_count = 0
In [3]:
# First pass over the edge list: assign every node label a dense index
# so the graph can be stored in an n x n sparse matrix.
with open('Google.txt','r') as google:
    for line in google:
        line_count += 1
        # Lines starting with '#' are header/comment lines in the dataset.
        if line.startswith('#') : continue
        # Each data line is "src<TAB>dst".
        tokens = line.strip().split('\t')
        #print tokens
        if tokens[0] not in nodesID:  # source node label
            nodesID[tokens[0]] = n
            n += 1
        if tokens[1] not in nodesID:  # destination node label
            nodesID[tokens[1]] = n
            n += 1
print len(nodesID)  # number of distinct nodes
print n             # should equal len(nodesID)
In [4]:
# COO triplets for the transition matrix M[dst, src]:
# row = destination index, col = source index, value = edge weight.
col = []
row = []
value = []
In [5]:
# Second pass: collect one (dst, src, 1.0) triplet per directed edge.
line_count = 0
with open('Google.txt','r') as google:
    for line in google:
        line_count += 1
        if line.startswith('#') : continue
        tokens = line.strip().split('\t')
        url1 = nodesID[tokens[0]]  # source node index
        url2 = nodesID[tokens[1]]  # destination node index
        # Store M[url2, url1] = 1: edge goes FROM col (source) TO row (dest).
        col.append(url1)
        row.append(url2)
        value.append(1.0)
In [6]:
# Unweighted adjacency matrix: M[dst, src] = 1 for each edge src -> dst.
# coo_matrix sums duplicate (row, col) pairs, so parallel edges accumulate.
M = coo_matrix((value, (row,col)), shape=(n, n))
print M
print M.shape
In [7]:
# In-degree of each node: row sums (axis 1) of M[dst, src], as an (n, 1)
# column.  A zero entry means the node has no incoming edges.
inLink = M.sum(1)
print inLink
In [8]:
# Out-degree of each node: column sums (axis 0) of M, transposed so it is
# also an (n, 1) column, indexable as outLink[node, 0].
outLink = M.sum(0).T
print outLink
print outLink.shape
In [9]:
value = [1.0 / outLink[col[i], 0] for i, v in enumerate(value)]
In [10]:
# Column-stochastic transition matrix: each entry is 1 / out-degree of the
# source node, so every non-empty column of M sums to 1.
M = coo_matrix((value, (row,col)), shape=(n, n))
print M
print M.shape
In [11]:
beta= 0.8
epsilon = 1./(10**11)
r = np.ones([n,1])
r = r/n
print np.sum(r)
In [12]:
for _ in xrange(250):
old_r = r
r = beta * M * r
for j in xrange(n):
if inLink[j,0] == 0 :
r[j] = 0
S = r.sum()
r = r + (1 - S)/n
if np.sum(np.abs(old_r - r)) < epsilon:
print "{} iterations".format(_)
old_r = r
break
else:
old_r = r
print 'Pagerank("99"):', r[nodesID['99']]
In [ ]: